\section{Training}
	
	\begin{frame}
		\frametitle{Training}
		\begin{enumerate}
			\item Start from a word-aligned corpus: a set of triples $<f,e,\sim>$
			\item Word alignments obtained automatically (e.g., GIZA++)
				\begin{itemize}
					\item Many-to-many alignments
				\end{itemize}
			\item Obtain distribution of possible derivations of each training example
				\begin{itemize}
					\item hypothesized using heuristics
				\end{itemize}
			\item Estimate the phrase translation parameters
		\end{enumerate}
	\end{frame}

	\begin{frame}
		\frametitle{Hypothesizing a Distribution of Possible Derivations (1)}
		\begin{enumerate}
			\item Identify \textit{initial phrase} pairs
				\begin{itemize}
					\item $<f^j_i, e^{j'}_{i'}>$ is an initial phrase pair of $<f, e, \sim>$ iff (worked example on the next slide):
					\begin{enumerate}
						\item $f_k \sim e_{k'}$ for some $k \in [i,j]$ and $k' \in [i',j']$
						\item $f_k \nsim e_{k'}$ for all $k \in [i,j]$ and $k' \not\in [i',j']$
						\item $f_k \nsim e_{k'}$ for all $k \not\in [i,j]$ and $k' \in [i',j']$
					\end{enumerate}
				\end{itemize}
		\end{enumerate}
	\end{frame}
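
	\begin{frame}
		\frametitle{Initial Phrase Pairs: Example}
		An illustrative toy example (not taken from the paper): let $f = f_1 f_2 f_3$ and $e = e_1 e_2$, with alignment points $f_1 \sim e_1$, $f_2 \sim e_2$, $f_3 \sim e_2$.
		\begin{itemize}
			\item $<f^1_1, e^1_1>$, $<f^3_2, e^2_2>$, and $<f^3_1, e^2_1>$ are initial phrase pairs: no alignment point links the inside of one span to the outside of the other
			\item $<f^2_2, e^2_2>$ is \emph{not}: $f_3 \sim e_2$ links a word outside the French span to a word inside the English span, violating condition 3
		\end{itemize}
	\end{frame}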

	\begin{frame}
		\frametitle{Hypothesizing a Distribution of Possible Derivations (2)}
		\begin{enumerate}
			\setcounter{enumi}{1}
			\item The set of rules of $<f, e, \sim>$ is the smallest set satisfying:
				\begin{enumerate}
					\item If $<f^j_i, e^{j'}_{i'}>$ is an initial phrase pair, then $X \rightarrow <f^j_i, e^{j'}_{i'}>$ is a rule
					\item If $r = X \rightarrow <\gamma, \alpha>$ is a rule and $<f^j_i, e^{j'}_{i'}>$ is an initial phrase pair such that $\gamma = \gamma_1 f^j_i \gamma_2$ and $\alpha = \alpha_1 e^{j'}_{i'} \alpha_2$, then $X \rightarrow <\gamma_1 X_k \gamma_2, \alpha_1 X_k \alpha_2>$ is a rule, where $k$ is an index not used in $r$ (worked example on the next slide)
				\end{enumerate}
		\end{enumerate}
	\end{frame}
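
	\begin{frame}
		\frametitle{Rule Subtraction: Example}
		Continuing the illustrative toy example (not taken from the paper):
		\begin{itemize}
			\item The initial phrase pair $<f^3_1, e^2_1>$ gives the rule $X \rightarrow <f_1 f_2 f_3, e_1 e_2>$
			\item Subtracting the initial phrase pair $<f^3_2, e^2_2>$ from it (so $\gamma_1 = f_1$, $\alpha_1 = e_1$, $\gamma_2 = \alpha_2 = \epsilon$) gives the rule $X \rightarrow <f_1\, X_1, e_1\, X_1>$
		\end{itemize}
	\end{frame}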

	\begin{frame}
		\frametitle{Filtering Grammar (1)}
		\begin{itemize}
			\item Very large number of rules generated
				\begin{itemize}
					\item Slows training and decoding
					\item Creates \textit{spurious ambiguity}: many distinct derivations that share the same feature vector and yield the same translation
				\end{itemize}
			\item Filter grammar!
		\end{itemize}
	\end{frame}
	
	\begin{frame}
		\frametitle{Filtering Grammar (2)}
		\begin{enumerate}
			\item If multiple initial phrase pairs contain the same set of alignment points, keep only the smallest
			\item Limit initial phrases to length 10 on the French side, and rules to 5 symbols on the French right-hand side
			\item In the subtraction step, the length of $f^j_i$ must be greater than one
				\begin{itemize}
					\item Why create a new rule that is no shorter than the original?
				\end{itemize} 
		\end{enumerate}
	\end{frame}

	\begin{frame}
		\frametitle{Filtering Grammar (3)}
		\begin{enumerate}
			\setcounter{enumi}{3}
			\item Limit number of nonterminals in rules to 2
				\begin{itemize}
					\item Simplifies decoding
					\item Prohibits nonterminals that are adjacent on the French side, a major cause of spurious ambiguity (illustrated on the next slide)
				\end{itemize} 
			\item A rule must have at least one pair of aligned words
				\begin{itemize}
					\item Ensures translation decisions are always based on some lexical evidence
				\end{itemize} 
		\end{enumerate}
	\end{frame}
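
	\begin{frame}
		\frametitle{Spurious Ambiguity: Example}
		An illustrative sketch (not taken from the paper): suppose the grammar allowed the purely abstract rule $X \rightarrow <X_1\, X_2, X_1\, X_2>$ (adjacent nonterminals, no aligned words) together with lexical rules for $<f_1, e_1>$, $<f_2, e_2>$, and $<f_3, e_3>$.
		\begin{itemize}
			\item The span $f_1 f_2 f_3$ could then be bracketed as $((f_1\, f_2)\, f_3)$ or $(f_1\, (f_2\, f_3))$
			\item Both derivations use exactly the same rules, hence the same feature vector and the same translation $e_1 e_2 e_3$
			\item The adjacency prohibition (filter 4) and the aligned-word requirement (filter 5) both exclude such rules
		\end{itemize}
	\end{frame}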
	
	\begin{frame}
		\frametitle{Estimate Translation Parameters}
		\begin{itemize}
			\item Distribution of possible derivations:
			\begin{itemize}
				\item Distribute weight equally among the initial phrase pairs of each training example: \\
				  	each phrase pair's weight is then distributed equally among the rules extracted from it
			\end{itemize} 
			\item The hypothesized distribution of derivations is treated as observed data
			\item $P(\gamma|\alpha)$ and $P(\alpha|\gamma)$ obtained by relative-frequency estimation (sketched below)
		\end{itemize}
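		A minimal sketch of the estimator (count notation assumed here; counts are the fractional weights from the heuristic above):
		\[
			P(\alpha \mid \gamma) = \frac{\mathrm{count}(X \rightarrow <\gamma, \alpha>)}{\sum_{\alpha'} \mathrm{count}(X \rightarrow <\gamma, \alpha'>)}
		\]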
	\end{frame}
	
\section{Decoding}

	\begin{frame}
		\frametitle{Decoding}
		\begin{itemize}
			\item CKY parser with beam search + postprocessor for mapping French derivations to English derivations
			\item Find the \textit{n} best derivations that generate $<f,e>$ for some \textit{e}
			\item Finding the highest-probability \textit{e} would require an expensive summation over derivations (see the sketch below)
				\begin{itemize}
					\item Instead: take the English yield of the highest-probability single derivation
					\item $e\left(\arg\max_{D :\, f(D) = f} w(D)\right)$
				\end{itemize}
		\end{itemize}
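		A sketch of the exact objective this approximates (formulation assumed here, not from the paper):
		\[
			\hat{e} = \arg\max_{e} \sum_{D :\, f(D) = f,\ e(D) = e} w(D)
		\]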
	\end{frame}

	\begin{frame}
		\frametitle{Pruning the Search Space (1)}
		\begin{enumerate}
			\item Discard items that score worse than $\beta$ times the best score in the same cell
				\begin{itemize}
					\item A cell contains all the items standing for X spanning $f^j_i$
				\end{itemize}
			\item Discard items worse than the \textit{b}th best item in the same cell
			\item Apply the same pruning to rules that share the same French side
		\end{enumerate}
	\end{frame}

	\begin{frame}
		\frametitle{Pruning the Search Space (2)}
		\begin{enumerate}
			\setcounter{enumi}{3}
			\item If an item falls outside the beam, then any item that would be generated using a lower-scoring rule or a lower-scoring antecedent item is also assumed to fall outside the beam
			\item Prohibit any X from spanning a substring longer than 10 on the French side
				\begin{itemize}
					\item Recall the length limit of 10 on initial phrases during training
				\end{itemize}
		\end{enumerate}
	\end{frame}

\section{Results}

	\begin{frame}
		\frametitle{Experiments}
		\begin{itemize}
			\item Mandarin-to-English translation
			\item Baseline: Pharaoh, a state-of-the-art phrase-based system
			\item Training set: FBIS corpus (7.2M + 9.2M words)
			\item Development set: 2002 NIST MT evaluation test set
			\item Test set: 2003 NIST MT test set
			\item Evaluation metric: BLEU
			\item Constituent feature added to the hierarchical model
		\end{itemize}
	\end{frame}

	\begin{frame}
		\frametitle{Results (1)}
		\includegraphics[width=6cm]{resources/extractedRules}
	\end{frame}

	\begin{frame}
		\frametitle{Results (2)}
		\includegraphics[width=10cm]{resources/precisions}\\
		\includegraphics[width=10cm]{resources/featureWeights}
	\end{frame}